# R script for cleaning raw census data downloaded from the IPUMS-USA online repository
# Preliminary Step: Supplementary procedures to the main analysis

# NOTE I:
## Due to IPUMS full-count data sharing policies,
## full-count samples cannot be shared or redistributed.
## For transparency and reproducibility,
## we provide the following sample code for cleaning raw data,
## including both full-count and linked samples.
## The code can be used to prepare the downloaded data in the same way we did,
## and generate the XLSX outputs that serve as inputs for subsequent analysis.

# NOTE II:
## Please ensure you substitute your own choice of filenames for the placeholders ('XXX')
## in all sections of the sample code below.
## The summary outputs used as inputs for later analysis are assigned fixed filenames
## to ensure smooth execution of the subsequent code that produces the final results.

# START SCRIPT

rm(list = ls())
run_sample_code <- FALSE # default: off (demo only)

# Load pkgs ---------------------------------------------------------------

if (!require('ipumsr')) install.packages('ipumsr')
if (!require('htmltools')) install.packages('htmltools')
if (!require('shiny')) install.packages('shiny')
if (!require('DT')) install.packages('DT')
if (!require('data.table')) install.packages("data.table")
if (!require('openxlsx')) install.packages('openxlsx')
if (!require('readxl')) install.packages('readxl')
if (!require('writexl')) install.packages('writexl')
if (!require('tidyverse')) install.packages('tidyverse')
if (!require('haven')) install.packages('haven')

# SAMPLE CODE: prepare full-count census, 1850-1880 -----------------------

# The input is a combined full-count sample covering 1850, 1860, 1870, and 1880
# Downloaded from the IPUMS data portal in XML and DAT.GZ formats
# The output is a single dataset with stacked observations from all four years

# WARNING: VERY LONG RUN TIME EXPECTED. MAY BE INTERRUPTED BY LOCAL MACHINE. CONSIDER INCREASING MEMORY OR RUN TIME LIMIT.

if (run_sample_code) {
  
  # read codebook
  ddi <- ipumsr::read_ipums_ddi("XXX.xml") # enter your own xml file name
  # ipumsr::ipums_view(ddi)
  
  # extract data in chunks
  chunk_size = 5000 # recommended procedure. large data.
  dat_chunked <- ipumsr::read_ipums_micro_chunked(ddi, callback = cbobj_selectVars, chunk_size = chunk_size, verbose = TRUE)
  class(dat_chunked) # "tbl_df", "tbl", "data.frame"
  dim(dat_chunked)
  
  # convert to data.table
  data.table::setDT(dat_chunked)
  class(dat_chunked) # "data.table", "data.frame"
  
  # keep 1850-1880 full counts only (in case)
  dat_1850TO1880 = dat_chunked[SAMPLE %in% c(185002, 186003, 187003, 188003)]
  dat_1850TO1880[,unique(YEAR)] # 1850, 1860, 1870, 1880
  dim(dat_1850TO1880)
  
  # export data for later usage
  saveRDS(dat_1850TO1880, "XXX.RDS") # enter your own preferred RDS filename
  
  # clear data objs
  rm(list = setdiff(ls(), 'run_sample_code'))
  
  # force garbage collection
  gc()
  
}

# SAMPLE CODE: further prepare full-count census, 1850-1880 ---------------

# The cleaned full-count samples are further prepared for state-level analysis

if (run_sample_code) {
  
  # read cleaned full-count samples
  census_1850to80 = readRDS("XXX.RDS") # enter your chosen RDS filename from before
  
  full_1850 = census_1850to80[YEAR==1850,]
  full_1860 = census_1850to80[YEAR==1860,]
  full_1870 = census_1850to80[YEAR==1870,]
  full_1880 = census_1850to80[YEAR==1880,]
  
  # label states
  label_state_names = function(fip) { 
    nm = NA_character_
    nm[fip==1] = 'Alabama'
    nm[fip==2] = 'Alaska'
    nm[fip==4] = 'Arizona'
    nm[fip==5] = 'Arkansas'
    nm[fip==6] = 'California'
    nm[fip==8] = 'Colorado'
    nm[fip==9] = 'Connecticut'
    nm[fip==10] = 'Delaware'
    nm[fip==11] = 'District of Columbia'
    nm[fip==12] = 'Florida'
    nm[fip==13] = 'Georgia'
    nm[fip==15] = 'Hawaii'
    nm[fip==16] = 'Idaho'
    nm[fip==17] = 'Illinois'
    nm[fip==18] = 'Indiana'
    nm[fip==19] = 'Iowa'
    nm[fip==20] = 'Kansas'
    nm[fip==21] = 'Kentucky'
    nm[fip==22] = 'Louisiana'
    nm[fip==23] = 'Maine'
    nm[fip==24] = 'Maryland'
    nm[fip==25] = 'Massachusetts'
    nm[fip==26] = 'Michigan'
    nm[fip==27] = 'Minnesota'
    nm[fip==28] = 'Mississippi'
    nm[fip==29] = 'Missouri'
    nm[fip==30] = 'Montana'
    nm[fip==31] = 'Nebraska'
    nm[fip==32] = 'Nevada'
    nm[fip==33] = 'New Hampshire'
    nm[fip==34] = 'New Jersey'
    nm[fip==35] = 'New Mexico'
    nm[fip==36] = 'New York'
    nm[fip==37] = 'North Carolina'
    nm[fip==38] = 'North Dakota'
    nm[fip==39] = 'Ohio'
    nm[fip==40] = 'Oklahoma'
    nm[fip==41] = 'Oregon'
    nm[fip==42] = 'Pennsylvania'
    nm[fip==44] = 'Rhode Island'
    nm[fip==45] = 'South Carolina'
    nm[fip==46] = 'South Dakota'
    nm[fip==47] = 'Tennessee'
    nm[fip==48] = 'Texas'
    nm[fip==49] = 'Utah'
    nm[fip==50] = 'Vermont'
    nm[fip==51] = 'Virginia'
    nm[fip==53] = 'Washington'
    nm[fip==54] = 'West Virginia'
    nm[fip==55] = 'Wisconsin'
    nm[fip==56] = 'Wyoming'
    return(nm)
  }
  
  full_1850[,"STATE":=label_state_names(STATEFIP)]
  full_1860[,"STATE":=label_state_names(STATEFIP)]
  full_1870[,"STATE":=label_state_names(STATEFIP)]
  full_1880[,"STATE":=label_state_names(STATEFIP)]
  
  # add (new) demographic indicators
  add_new_indicators = function(dt) {
    dt[, CONFEDERATE := as.integer(STATEFIP %in% c(1, 5, 12, 13, 22, 28, 37, 45, 47, 48, 51))] 
    dt[, BORDER := as.integer(STATEFIP %in% c(10, 21, 24, 29, 54))]
    dt[, UNION := as.integer(STATEFIP %in% c(23, 36, 33, 50, 25, 9, 44, 42, 34, 39, 18, 17, 20, 26, 55, 27, 19, 6, 32, 41))]
    dt[, MALE := as.integer(SEX==1)][, FEMALE := as.integer(SEX==2)]
    dt[, WHITE := as.integer(RACE==1)][, BLACK:=as.integer(RACE==2)]
    dt[, NATIVEBORN := as.integer(BPL<100)][, FOREIGNBORN := as.integer(BPL>=100 & BPL!=999)] 
    dt[,"5-14":=as.integer(AGE>=5&AGE<=14)][,"15-24":=as.integer(AGE>=15&AGE<=24)][,"25-34":=as.integer(AGE>=25&AGE<=34)][,"35-44":=as.integer(AGE>=35&AGE<=44)][,"45-54":=as.integer(AGE>=45&AGE<=54)]
    dt[,"<5":=as.integer(AGE<=4)][,">54":=as.integer(AGE>=55)] 
  }
  
  full_1850_plus = add_new_indicators(full_1850)
  full_1860_plus = add_new_indicators(full_1860)
  full_1870_plus = add_new_indicators(full_1870)
  full_1880_plus = add_new_indicators(full_1880)
  
  # select cases by conditions
  Full_1850 = full_1850_plus[UNION==1 | BORDER ==1 | CONFEDERATE==1, ][WHITE==1 & NATIVEBORN==1,][`<5`!=1 & `>54`!=1,] 
  Full_1860 = full_1860_plus[UNION==1 | BORDER ==1 | CONFEDERATE==1, ][WHITE==1 & NATIVEBORN==1,][`<5`!=1 & `>54`!=1,]
  Full_1870 = full_1870_plus[UNION==1 | BORDER ==1 | CONFEDERATE==1, ][WHITE==1 & NATIVEBORN==1,][`<5`!=1 & `>54`!=1,]
  Full_1880 = full_1880_plus[UNION==1 | BORDER ==1 | CONFEDERATE==1, ][WHITE==1 & NATIVEBORN==1,][`<5`!=1 & `>54`!=1,]
  
  # select key variables
  FULL_1850 = Full_1850[,dplyr::select(.SD, YEAR, STATEFIP, STATE, COUNTYNHG, UNION, BORDER, CONFEDERATE, MALE, FEMALE, WHITE, NATIVEBORN, `5-14`:`45-54`)]
  FULL_1860 = Full_1860[,dplyr::select(.SD, YEAR, STATEFIP, STATE, COUNTYNHG, UNION, BORDER, CONFEDERATE, MALE, FEMALE, WHITE, NATIVEBORN, `5-14`:`45-54`)]
  FULL_1870 = Full_1870[,dplyr::select(.SD, YEAR, STATEFIP, STATE, COUNTYNHG, UNION, BORDER, CONFEDERATE, MALE, FEMALE, WHITE, NATIVEBORN, `5-14`:`45-54`)]
  FULL_1880 = Full_1880[,dplyr::select(.SD, YEAR, STATEFIP, STATE, COUNTYNHG, UNION, BORDER, CONFEDERATE, MALE, FEMALE, WHITE, NATIVEBORN, `5-14`:`45-54`)]
  
  # save cleaned full-counts
  FULL_1850to80 = list("1850"=FULL_1850, "1860"=FULL_1860, "1870"=FULL_1870, "1880"=FULL_1880)
  saveRDS(FULL_1850to80, "XXX.RDS") # enter your own preferred RDS filename here
  
  # clean-ups and garbage removals
  rm(list = setdiff(ls(), 'run_sample_code'))
  gc()
  
}

# SAMPLE CODE: prepare linked census, 1850-1880 ---------------------------

# The input consists of four separate linked censuses covering 1850-1880
# Downloaded from the IPUMS data portal in XML and DAT.GZ formats (one pair per year)
# The output is a named list of three datasets with linked observations for each period (e.g., 1850-1860)

# WARNING: EXPECT HIGH MEMORY USAGE AND MODERATELY LONG RUN TIMES.

if (run_sample_code) {
  
  # extract linked samples
  ddi_1850 <- ipumsr::read_ipums_ddi("XXX.xml") # enter your own xml filename for 1850 linked census
  ddi_1860 <- ipumsr::read_ipums_ddi("XXX.xml") # enter your own xml filename for 1860 linked census
  ddi_1870 <- ipumsr::read_ipums_ddi("XXX.xml") # enter your own xml filename for 1870 linked census
  ddi_1880 <- ipumsr::read_ipums_ddi("XXX.xml") # enter your own xml filename for 1880 linked census
  
  dat_1850 <- ipumsr::read_ipums_micro(ddi_1850)
  data.table::setDT(dat_1850)
  dat_1860 <- ipumsr::read_ipums_micro(ddi_1860)
  data.table::setDT(dat_1860)
  dat_1870 <- ipumsr::read_ipums_micro(ddi_1870)
  data.table::setDT(dat_1870)
  dat_1880 <- ipumsr::read_ipums_micro(ddi_1880)
  data.table::setDT(dat_1880)
  
  # select linked cases by period
  select_linked_cases = function(lc1, lc2, link_flag1, link_flag2) {
    hik_lkd = intersect(lc1$HIK, lc2$HIK)
    lc1_lkd = lc1[HIK %in% hik_lkd,]
    lc2_lkd = lc2[HIK %in% hik_lkd,]
    lc1_lkd[, (link_flag1) := NULL]
    lc2_lkd[, (link_flag2) := NULL]
    return(list(lc1_lkd,lc2_lkd))
  }
  
  linked_1850and1860 <- select_linked_cases(dat_1850, dat_1860, "LINK1850", "LINK1860")
  nrow(linked_1850and1860[[1]]) == nrow(linked_1850and1860[[2]]) # must be TRUE
  linked_1860and1870 <- select_linked_cases(dat_1860, dat_1870, "LINK1860", "LINK1870")
  nrow(linked_1860and1870[[1]]) == nrow(linked_1860and1870[[2]]) # must be TRUE
  linked_1870and1880 <- select_linked_cases(dat_1870, dat_1880, "LINK1870", "LINK1880")
  nrow(linked_1870and1880[[1]]) == nrow(linked_1870and1880[[2]]) # must be TRUE
  
  # remove codebook and raw data
  rm(dat_1850, dat_1860, dat_1870, dat_1880)
  rm(ddi_1850, ddi_1860, ddi_1870, ddi_1880)
  
  ## label states
  label_state_names = function(fip) { 
    nm = NA_character_
    nm[fip==1] = 'Alabama'
    nm[fip==2] = 'Alaska'
    nm[fip==4] = 'Arizona'
    nm[fip==5] = 'Arkansas'
    nm[fip==6] = 'California'
    nm[fip==8] = 'Colorado'
    nm[fip==9] = 'Connecticut'
    nm[fip==10] = 'Delaware'
    nm[fip==11] = 'District of Columbia'
    nm[fip==12] = 'Florida'
    nm[fip==13] = 'Georgia'
    nm[fip==15] = 'Hawaii'
    nm[fip==16] = 'Idaho'
    nm[fip==17] = 'Illinois'
    nm[fip==18] = 'Indiana'
    nm[fip==19] = 'Iowa'
    nm[fip==20] = 'Kansas'
    nm[fip==21] = 'Kentucky'
    nm[fip==22] = 'Louisiana'
    nm[fip==23] = 'Maine'
    nm[fip==24] = 'Maryland'
    nm[fip==25] = 'Massachusetts'
    nm[fip==26] = 'Michigan'
    nm[fip==27] = 'Minnesota'
    nm[fip==28] = 'Mississippi'
    nm[fip==29] = 'Missouri'
    nm[fip==30] = 'Montana'
    nm[fip==31] = 'Nebraska'
    nm[fip==32] = 'Nevada'
    nm[fip==33] = 'New Hampshire'
    nm[fip==34] = 'New Jersey'
    nm[fip==35] = 'New Mexico'
    nm[fip==36] = 'New York'
    nm[fip==37] = 'North Carolina'
    nm[fip==38] = 'North Dakota'
    nm[fip==39] = 'Ohio'
    nm[fip==40] = 'Oklahoma'
    nm[fip==41] = 'Oregon'
    nm[fip==42] = 'Pennsylvania'
    nm[fip==44] = 'Rhode Island'
    nm[fip==45] = 'South Carolina'
    nm[fip==46] = 'South Dakota'
    nm[fip==47] = 'Tennessee'
    nm[fip==48] = 'Texas'
    nm[fip==49] = 'Utah'
    nm[fip==50] = 'Vermont'
    nm[fip==51] = 'Virginia'
    nm[fip==53] = 'Washington'
    nm[fip==54] = 'West Virginia'
    nm[fip==55] = 'Wisconsin'
    nm[fip==56] = 'Wyoming'
    return(nm)
  }

  label_state_names_dt = function(dt) {dt[,"STATE":=label_state_names(STATEFIP)]; return(dt)}

  linked_1850and1860 = lapply(linked_1850and1860, label_state_names_dt)
  linked_1860and1870 = lapply(linked_1860and1870, label_state_names_dt)
  linked_1870and1880 = lapply(linked_1870and1880, label_state_names_dt)

  # add demographic indicators
  add_new_indicators = function(dt) {
    dt[, CONFEDERATE := as.integer(STATEFIP %in% c(1, 5, 12, 13, 22, 28, 37, 45, 47, 48, 51))] 
    dt[, BORDER := as.integer(STATEFIP %in% c(10, 21, 24, 29, 54))]
    dt[, UNION := as.integer(STATEFIP %in% c(23, 36, 33, 50, 25, 9, 44, 42, 34, 39, 18, 17, 20, 26, 55, 27, 19, 6, 32, 41))]
    dt[, MALE := as.integer(SEX==1)][, FEMALE := as.integer(SEX==2)]
    dt[, WHITE := as.integer(RACE==1)][, BLACK:=as.integer(RACE==2)]
    dt[, NATIVEBORN := as.integer(BPL<100)][, FOREIGNBORN := as.integer(BPL>=100 & BPL!=999)] 
    dt[,"5-14":=as.integer(AGE>=5&AGE<=14)][,"15-24":=as.integer(AGE>=15&AGE<=24)][,"25-34":=as.integer(AGE>=25&AGE<=34)][,"35-44":=as.integer(AGE>=35&AGE<=44)][,"45-54":=as.integer(AGE>=45&AGE<=54)]
    dt[,"<5":=as.integer(AGE<=4)][,">54":=as.integer(AGE>=55)] 
  }
  
  linked_1850and1860 = lapply(linked_1850and1860, add_new_indicators)
  linked_1860and1870 = lapply(linked_1860and1870, add_new_indicators)
  linked_1870and1880 = lapply(linked_1870and1880, add_new_indicators)
  
  # select key variables
  select_linked_vars <- function(dt) {
    dt[, dplyr::select(.SD, YEAR, HIK, STATEFIP, STATE, COUNTYNHG, SEX, AGE, RACE, BPL, CONFEDERATE, UNION, BORDER, MALE, FEMALE, WHITE, NATIVEBORN, `5-14`:`45-54`)]
  }
  
  linked_1850and1860 <- lapply(linked_1850and1860, select_linked_vars)
  linked_1860and1870 <- lapply(linked_1860and1870, select_linked_vars)
  linked_1870and1880 <- lapply(linked_1870and1880, select_linked_vars)
  
  # drop false matches (in case)
  drop_false_matches <- function(dt_ls) {
    
    dt2 = merge(
      dt_ls[[1]][,dplyr::select(.SD, HIK, MALE1=MALE,WHITE1=WHITE,NATIVEBORN1=NATIVEBORN, AGE1=AGE)],
      dt_ls[[2]][,dplyr::select(.SD, HIK, MALE2=MALE,WHITE2=WHITE,NATIVEBORN2=NATIVEBORN, AGE2=AGE)],
      by = "HIK"
    )
      
    # filter by matches in all key dimensions
    dt_matched = dt2[(MALE1==MALE2 & WHITE1==WHITE2 & NATIVEBORN1==NATIVEBORN2 & (AGE2>=AGE1+5) & (AGE2<=AGE1+15)), ]
    
    # extract keys for matched cases
    key_matched = unique(dt_matched$HIK) 
    
    # filter by matched cases
    dt_matched1 = dt_ls[[1]][HIK %in% key_matched,]
    dt_matched2 = dt_ls[[2]][HIK %in% key_matched,]
    
    # increment T1 cohorts for T2 cases 
    DF_matched1 = dt_matched1[,dplyr::select(.SD, YEAR, HIK, STATEFIP, STATE, COUNTYNHG, CONFEDERATE, UNION, BORDER, MALE, FEMALE, WHITE, NATIVEBORN, `5-14`:`35-44`)]
    cohort1to2 = DF_matched1[,.(HIK, `15-24`=`5-14`, `25-34`=`15-24`, `35-44`=`25-34`, `45-54`=`35-44`)]
    DF_matched2 = dt_matched2[,dplyr::select(.SD, YEAR, HIK, STATEFIP, STATE, COUNTYNHG, CONFEDERATE, UNION, BORDER, MALE, FEMALE, WHITE, NATIVEBORN)]
    DF_matched2 = merge(DF_matched2, cohort1to2, by = "HIK")
    
    # return matched cases
    return(list(DF_matched1, DF_matched2))
      
    }
    
    Linked_1850and1860 <- drop_false_matches(linked_1850and1860)
    nrow(Linked_1850and1860[[1]]) == nrow(Linked_1850and1860[[2]]) # must be TRUE
    Linked_1860and1870 <- drop_false_matches(linked_1860and1870)
    nrow(Linked_1860and1870[[1]]) == nrow(Linked_1860and1870[[2]]) # must be TRUE
    Linked_1870and1880 <- drop_false_matches(linked_1870and1880)
    nrow(Linked_1870and1880[[1]]) == nrow(Linked_1870and1880[[2]]) # must TRUE
    
    # remove old data
    rm(linked_1850and1860, linked_1860and1870, linked_1870and1880)
    
    # select true matches by key conditions
    select_linked_cases2a <- function(dt_ls) {
      # native-born AND white AND (age IN 5-44 if T1 OR age IN 15-54 if T2)
      dt1 = dt_ls[[1]][WHITE==1 & NATIVEBORN==1,][`5-14`==1 | `15-24`==1 |`25-34`==1 |`35-44`==1, ]
      dt2 = dt_ls[[2]][WHITE==1 & NATIVEBORN==1,][`15-24`==1 |`25-34`==1 |`35-44`==1 | `45-54`==1, ]
      return(list(dt1, dt2))
    }
    select_linked_cases2b = function(lc1, lc2) {
      hik_lkd = intersect(lc1$HIK, lc2$HIK)
      lc1_lkd = lc1[HIK %in% hik_lkd,]
      lc2_lkd = lc2[HIK %in% hik_lkd,]
      return(list(lc1_lkd,lc2_lkd))
    }
    
    Linked_1850and1860 <- select_linked_cases2a(Linked_1850and1860)
    Linked_1860and1870 <- select_linked_cases2a(Linked_1860and1870)
    Linked_1870and1880 <- select_linked_cases2a(Linked_1870and1880)
    
    LINKED_1850and1860 <- select_linked_cases2b(Linked_1850and1860[[1]], Linked_1850and1860[[2]])
    LINKED_1860and1870 <- select_linked_cases2b(Linked_1860and1870[[1]], Linked_1860and1870[[2]])
    LINKED_1870and1880 <- select_linked_cases2b(Linked_1870and1880[[1]], Linked_1870and1880[[2]])
    
    # further tidy ups
    tidyup_linked_census = function(ls_dt) {
      dt1 = ls_dt[[1]][order(HIK),dplyr::select(.SD, YEAR, HIK, STATEFIP, STATE, COUNTYNHG, CONFEDERATE, UNION, BORDER, MALE, FEMALE, WHITE, NATIVEBORN, `5-14`:`35-44`)]
      dt2 = ls_dt[[2]][order(HIK),dplyr::select(.SD, YEAR, HIK, STATEFIP, STATE, COUNTYNHG, CONFEDERATE, UNION, BORDER, MALE, FEMALE, WHITE, NATIVEBORN, `15-24`:`45-54`)]
      return(list(dt1, dt2))
    }
    
    LINKED_1850and1860 <- tidyup_linked_census(LINKED_1850and1860)
    LINKED_1860and1870 <- tidyup_linked_census(LINKED_1860and1870)
    LINKED_1870and1880 <- tidyup_linked_census(LINKED_1870and1880)
    
    # save cleaned linked samples
    LINKED_1850to80 = list("1850-1860" = LINKED_1850and1860, 
                           "1860-1870" = LINKED_1860and1870, 
                           "1870-1880" = LINKED_1870and1880)
    saveRDS(LINKED_1850to80, "XXX.RDS") # enter your preferred RDS filename here
    
    # clean-ups
    rm(list = setdiff(ls(), 'run_sample_code'))
    gc()
    
}

# SAMPLE CODE: produce national summaries, full counts 1850-1880 ----------

# The input is the extracted and cleaned full-count data covering 1850-1880 (as prepared above)
# The output is a set of XLSX files used as input data for the national analysis

if (run_sample_code) {
  
  # read full-count samples
  censuses = readRDS("XXX.RDS") # enter your chosen RDS filename from above
  
  census_1850 = censuses[YEAR==1850,]
  census_1860 = censuses[YEAR==1860,]
  census_1870 = censuses[YEAR==1870,]
  census_1880 = censuses[YEAR==1880,]
  
  # add demographic indicators
  add_new_indicators = function(dt) {
    # demographic indicators
    dt[, MALE := as.integer(SEX==1)][, FEMALE := as.integer(SEX==2)]
    dt[, WHITE := as.integer(RACE==1)][, BLACK:=as.integer(RACE==2)]
    # nativity (birthplace) indicators 
    dt[, NATIVEBORN := as.integer(BPL<100)][, FOREIGNBORN := as.integer(BPL>=100 & BPL !=999)] 
    # age group (5-year) indicators
    dt[, "0-4":=as.integer(AGE>=0&AGE<=4)][, "5–9":=as.integer(AGE>=5&AGE<=9)][, "10–14":=as.integer(AGE>=10&AGE<=14)][, "15–19":=as.integer(AGE>=15&AGE<=19)]
    dt[, "20–24":=as.integer(AGE>=20&AGE<=24)][, "25–29":=as.integer(AGE>=25&AGE<=29)][, "30–34":=as.integer(AGE>=30&AGE<=34)][, "35–39":=as.integer(AGE>=35&AGE<=39)]
    dt[, "40–44":=as.integer(AGE>=40&AGE<=44)][, "45–49":=as.integer(AGE>=45&AGE<=49)][, "50–54":=as.integer(AGE>=50&AGE<=54)][, "55–59":=as.integer(AGE>=55&AGE<=59)]
    dt[, "60–64":=as.integer(AGE>=60&AGE<=64)][, "65–69":=as.integer(AGE>=65&AGE<=69)][, "70–74":=as.integer(AGE>=70&AGE<=74)][, "75–79":=as.integer(AGE>=75&AGE<=79)]
    dt[, "80–84":=as.integer(AGE>=80&AGE<=84)][, "85+":=as.integer(AGE>=85)]
  }
  
  add_new_indicators(census_1850)
  add_new_indicators(census_1860)
  add_new_indicators(census_1870)
  add_new_indicators(census_1880)
  
  # compute cohort size for native-born whites by sex for each census year
  compute_subpopulation_by_agegroup = function(cen, subpop = expression()) {
    cen_sub = cen[eval(subpop),]
    cen_sub_sum = cen_sub[, lapply(.SD,sum), .SDcols = `0-4`:`85+`]
    return(cen_sub_sum)
  }
  
  WM_1850 = compute_subpopulation_by_agegroup(census_1850, subpop = expression(MALE==1 & WHITE==1 & NATIVEBORN==1))
  WF_1850 = compute_subpopulation_by_agegroup(census_1850, subpop = expression(FEMALE==1 & WHITE==1 & NATIVEBORN==1))
  WM_1860 = compute_subpopulation_by_agegroup(census_1860, subpop = expression(MALE==1 & WHITE==1 & NATIVEBORN==1))
  WF_1860 = compute_subpopulation_by_agegroup(census_1860, subpop = expression(FEMALE==1 & WHITE==1 & NATIVEBORN==1))
  WM_1870 = compute_subpopulation_by_agegroup(census_1870, subpop = expression(MALE==1 & WHITE==1 & NATIVEBORN==1))
  WF_1870 = compute_subpopulation_by_agegroup(census_1870, subpop = expression(FEMALE==1 & WHITE==1 & NATIVEBORN==1))
  WM_1880 = compute_subpopulation_by_agegroup(census_1880, subpop = expression(MALE==1 & WHITE==1 & NATIVEBORN==1))
  WF_1880 = compute_subpopulation_by_agegroup(census_1880, subpop = expression(FEMALE==1 & WHITE==1 & NATIVEBORN==1))
  
  writexl::write_xlsx(
    list(
      '1850' = WM_1850,
      '1860' = WM_1860,
      '1870' = WM_1870,
      '1880' = WM_1880
    ),
    'DAT_Ncohort_nbwm_1850to1880.xlsx')
  
  writexl::write_xlsx(
    list(
      '1850' = WF_1850,
      '1860' = WF_1860,
      '1870' = WF_1870,
      '1880' = WF_1880
    ),
    'DAT_Ncohort_nbwf_1850to1880.xlsx')
  
  # compute cohort size for native-born whites by sex in 1860
  WM_1860_baseALL = compute_subpopulation_by_agegroup(
    census_1860, subpop = expression(MALE==1 & WHITE==1))[,c('10–14', '15–19', '20–24', '25–29', '30–34', '35–39', '40–44')]
  
  writexl::write_xlsx(
    WM_1860_baseALL, 
    'DAT_Ncohort_wm_1860.xlsx'
  )
  
  # remove data objects
  rm(list = setdiff(ls(), 'run_sample_code'))
  
  # force garbage collection
  gc()
  
}

# SAMPLE CODE: produce state-level summaries, full counts 1850-1880 -------

if (run_sample_code) {
  
  # read full-count samples
  full_1850to80 <- readRDS("XXX.RDS") # enter your chosen RDS filename above (for the further cleaned full counts)
  
  full_1850 <- full_1850to80$`1850`
  full_1860 <- full_1850to80$`1860`
  full_1870 <- full_1850to80$`1870`
  full_1880 <- full_1850to80$`1880`
  rm(full_1850to80) # remove the stacked full dataset
  
  # compute native-born white cohort sizes by sex and state
  count_cohort_by_sexstate = function(cen, males = TRUE) {
    if (males) {
      cen_sub = cen[MALE==1]
    } else {cen_sub = cen[FEMALE==1]}
    cen_sum = cen_sub[, lapply(.SD,sum), by=STATE, .SDcols = `5-14`:`45-54`]
    cen_sum = as.data.frame(cen_sum) 
    cen_sum = cen_sum[order(cen_sum$STATE),]
    return(cen_sum)
  }

  ncohort_male_1850 <- count_cohort_by_sexstate(full_1850)
  ncohort_male_1860 <- count_cohort_by_sexstate(full_1860)
  ncohort_male_1870 <- count_cohort_by_sexstate(full_1870)
  ncohort_male_1880 <- count_cohort_by_sexstate(full_1880)
  ncohort_males <- list("1850"=ncohort_male_1850, "1860"=ncohort_male_1860, 
                        "1870"=ncohort_male_1870, "1880"=ncohort_male_1880)
  
  ncohort_female_1850 <- count_cohort_by_sexstate(full_1850, males = FALSE)
  ncohort_female_1860 <- count_cohort_by_sexstate(full_1860, males = FALSE)
  ncohort_female_1870 <- count_cohort_by_sexstate(full_1870, males = FALSE)
  ncohort_female_1880 <- count_cohort_by_sexstate(full_1880, males = FALSE)
  ncohort_females <- list("1850"=ncohort_female_1850, "1860"=ncohort_female_1860, 
                          "1870"=ncohort_female_1870, "1880"=ncohort_female_1880)
  
  writexl::write_xlsx(
    ncohort_males,
    'DAT_Ncohort_nbwm_1850to1880_byState.xlsx'
  )
  writexl::write_xlsx(
    ncohort_females,
    'DAT_Ncohort_nbwf_1850to1880_byState.xlsx'
  )
  
  # compute state groupings (by region/battleside)
  state_group <- full_1880[, lapply(.SD,sum), by=STATE, .SDcols = c('UNION','BORDER','CONFEDERATE')] |>
    as.data.frame() |>
    dplyr::mutate(
      GROUP = dplyr::case_when(
        UNION>0~"Union",
        BORDER>0~"Border",
        CONFEDERATE>0~"Confederate"
      ) |> factor(levels = c("Union","Border","Confederate"))
    ) |>
    dplyr::select(STATE, GROUP) |>
    dplyr::arrange(GROUP, STATE)
  
  writexl::write_xlsx(
    state_group,
    'DICT_state_battleside.xlsx'
  )
  
  # clean-ups
  rm(list = setdiff(ls(), 'run_sample_code'))
  gc()
  
}

# SAMPLE CODE: produce state-level summaries, linked records 1850- --------

if (run_sample_code) {
  
  # read cleaned linked records
  linked_1850to80 <- readRDS("XXX.RDS") # enter your chosen RDS filename above (for linked samples)
  
  linked_1850to60 <- linked_1850to80$`1850-1860`
  linked_1860to70 <- linked_1850to80$`1860-1870`
  linked_1870to80 <- linked_1850to80$`1870-1880`
  rm(linked_1850to80) # remove full list of datasets
  
  # link census records (row-wise append)
  link_census_records <- function(cen_pair, sex = 1) { # 1 = male, 0 = female
    cen1 = cen_pair[[1]][MALE==sex,]
    cen2 = cen_pair[[2]][MALE==sex,]
    rec_t1 = cen1[,.(HIK, STATE_T1 = STATE, C1_T1 = `5-14`, C2_T1 = `15-24`, C3_T1 = `25-34`, C4_T1 =`35-44`)]
    rec_t2 = cen2[,.(HIK, STATE_T2 = STATE)]
    rec_t1to2 = data.table::merge.data.table(rec_t1, rec_t2, by = "HIK")
    return(rec_t1to2)
  }
  
  records_male_1850to60 <- link_census_records(linked_1850to60)
  records_male_1860to70 <- link_census_records(linked_1860to70)
  records_male_1870to80 <- link_census_records(linked_1870to80)
  
  # compute linked cohort sizes for native-born white males by state
  count_linked_t1 <- function(rec_linked, state_t1 = state_vec) {
    rec_linked = rec_linked[STATE_T1 %in% state_t1,]
    nrec_linked_t1 <- rec_linked[, lapply(.SD,sum), by=STATE_T1, .SDcols = `C1_T1`:`C4_T1`]
    nrec_linked_t1 <- as.data.frame(nrec_linked_t1)
    names(nrec_linked_t1) <- c("STATE","5-14","15-24","25-34","35-44")
    nrec_linked_t1 <- nrec_linked_t1[order(nrec_linked_t1$STATE),]
    return(nrec_linked_t1)
  }
  
  nrecords_male_1850to60 <- count_linked_t1(records_male_1850to60)
  nrecords_male_1860to70 <- count_linked_t1(records_male_1860to70)
  nrecords_male_1870to80 <- count_linked_t1(records_male_1870to80)
  
  nrecords_males <- list(
    "1850-60" = nrecords_male_1850to60,
    "1860-70" = nrecords_male_1860to70,
    "1870-80" = nrecords_male_1870to80
  )
  writexl::write_xlsx(
    nrecords_males,
    'DAT_Ncohort_nbwm_1850to1880_byState_Linked.xlsx'
  )
  
  # compute out-migration counts for native-born white males by state
  count_out_migration <- function(rec_linked, state_t1 = state_vec) {
    rec_linked = rec_linked[STATE_T1 %in% state_t1,]
    rec_out = rec_linked[STATE_T2!=STATE_T1,]
    nrec_out = rec_out[, lapply(.SD,sum), by=STATE_T1, .SDcols = `C1_T1`:`C4_T1`]
    nrec_out = as.data.frame(nrec_out)
    names(nrec_out) <- c("STATE","5-14","15-24","25-34","35-44")
    nrec_out <- nrec_out[order(nrec_out$STATE),]
    return(nrec_out)
  }
  
  nout_male_1850to60 <- count_out_migration(records_male_1850to60)
  nout_male_1860to70 <- count_out_migration(records_male_1860to70)
  nout_male_1870to80 <- count_out_migration(records_male_1870to80)
  
  nout_males <- list(
    "1850-60" = nout_male_1850to60,
    "1860-70" = nout_male_1860to70,
    "1870-80" = nout_male_1870to80
  )
  writexl::write_xlsx(
    nout_males,
    'DAT_Noutmig_nbwm_1850to1880_byState_Linked.xlsx'
  )
  
  # compute in-migration counts for native-born white males by state
  count_in_migration <- function(rec_linked, state_t2 = state_vec) {
    rec_linked = rec_linked[STATE_T2 %in% state_t2,]
    rec_in = rec_linked[STATE_T1!=STATE_T2,]
    nrec_in = rec_in[, lapply(.SD,sum), by=STATE_T2, .SDcols = `C1_T1`:`C4_T1`]
    nrec_in = as.data.frame(nrec_in)
    names(nrec_in) <- c("STATE","5-14","15-24","25-34","35-44")
    nrec_in <- nrec_in[order(nrec_in$STATE),]
    return(nrec_in)
  }
  
  nin_male_1850to60 <- count_in_migration(records_male_1850to60)
  nin_male_1860to70 <- count_in_migration(records_male_1860to70)
  nin_male_1870to80 <- count_in_migration(records_male_1870to80)
  
  nin_males <- list(
    "1850-60" = nin_male_1850to60,
    "1860-70" = nin_male_1860to70,
    "1870-80" = nin_male_1870to80
  )
  writexl::write_xlsx(
    nin_males,
    'DAT_Ninmig_nbwm_1850to1880_byState_Linked.xlsx'
  )
  
  # clean-ups 
  rm(list = setdiff(ls(), 'run_sample_code'))
  gc()
  
}

# END SCRIPT HERE
